This notebook is Tom’s analysis from raw data

source("../CamProt_R/Utility.R")
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ tibble  2.0.0     ✔ purrr   0.2.5
✔ readr   1.3.1     ✔ stringr 1.3.1
✔ tibble  2.0.0     ✔ forcats 0.3.0
── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ MSnbase::combine()       masks Biobase::combine(), BiocGenerics::combine(), dplyr::combine()
✖ S4Vectors::expand()      masks tidyr::expand()
✖ dplyr::filter()          masks stats::filter()
✖ S4Vectors::first()       masks dplyr::first()
✖ dplyr::lag()             masks stats::lag()
✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
✖ purrr::reduce()          masks MSnbase::reduce()
✖ S4Vectors::rename()      masks dplyr::rename()
infile <- "Input/OOPS_qLOPIT_LabelFree_PeptideGroups_parsed.txt"
samples_inf <- "Input/samples.tsv"
peptides_df <- parse_features(infile, silac=FALSE, TMT=FALSE, level="peptide", filter_crap=TRUE)
Tally of features at each stage:
29067   All features with a PSM
These features are associated with 2597 master proteins
27490   Excluding features without a master protein
These features are associated with 2596 master proteins
26424   Excluding features without a unique master protein
These features are associated with 2311 master proteins
26254   Excluding features matching a cRAP protein
These features are associated with 2301 master proteins
25987   Excluding features associated with a cRAP protein
These features are associated with 2283 master proteins
colnames(peptides_df)
 [1] "Checked"                           "Confidence"                        "Sequence"                         
 [4] "Modifications"                     "Qvality.PEP"                       "Qvality.q.value"                  
 [7] "Number.of.Protein.Groups"          "Number.of.Proteins"                "Number.of.PSMs"                   
[10] "Master.Protein.Accessions"         "Number.of.Missed.Cleavages"        "Theo.MHplus.in.Da"                
[13] "Found.in.Sample.in.S1.F1.Sample"   "Found.in.Sample.in.S2.F2.Sample"   "Found.in.Sample.in.S3.F3.Sample"  
[16] "Found.in.Sample.in.S4.F4.Sample"   "Found.in.Sample.in.S6.F6.Sample"   "Found.in.Sample.in.S5.F5.Sample"  
[19] "Found.in.Sample.in.S7.F7.Sample"   "Found.in.Sample.in.S8.F8.Sample"   "Found.in.Sample.in.S9.F9.Sample"  
[22] "Found.in.Sample.in.S10.F10.Sample" "Found.in.Sample.in.S11.F11.Sample" "Found.in.Sample.in.S12.F12.Sample"
[25] "Found.in.Sample.in.S13.F13.Sample" "Found.in.Sample.in.S14.F14.Sample" "Found.in.Sample.in.S15.F15.Sample"
[28] "Found.in.Sample.in.S16.F16.Sample" "Found.in.Sample.in.S17.F17.Sample" "Found.in.Sample.in.S18.F18.Sample"
[31] "Found.in.Sample.in.S19.F19.Sample" "Found.in.Sample.in.S20.F20.Sample" "Area.F1.Sample"                   
[34] "Area.F2.Sample"                    "Area.F3.Sample"                    "Area.F4.Sample"                   
[37] "Area.F6.Sample"                    "Area.F5.Sample"                    "Area.F7.Sample"                   
[40] "Area.F8.Sample"                    "Area.F9.Sample"                    "Area.F10.Sample"                  
[43] "Area.F11.Sample"                   "Area.F12.Sample"                   "Area.F13.Sample"                  
[46] "Area.F14.Sample"                   "Area.F15.Sample"                   "Area.F16.Sample"                  
[49] "Area.F17.Sample"                   "Area.F18.Sample"                   "Area.F19.Sample"                  
[52] "Area.F20.Sample"                   "XCorr.Sequest.HT"                  "Confidence.Sequest.HT"            
[55] "Percolator.q.Value.Sequest.HT"     "Percolator.PEP.Sequest.HT"         "master_protein"                   
[58] "protein_length"                    "protein_description"               "peptide_start"                    
[61] "peptide_end"                       "crap_protein"                      "associated_crap_protein"          
[64] "unique"                            "filename"                         
peptides_quant <- makeMSNSet(obj=peptides_df, samples_inf=samples_inf, ab_col_ix=2, level="peptide", quant_name="Area")

tallies for missing data (# samples with missing)
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20 
  16   35   98   99  100  119  141  144  224  278  327  386  517  684  930 1238 1696 2517 3981 8158 4299 

4299 peptides do not have any quantification values

So lots and lots of missing values! most peptides are quantified in only a very minor subset of fractions (<5/20). This is no suprise since we’re dealing with LFQ and subcellular fractionation here.

plotMissing(peptides_quant)
Out of 21688 total features, 21672 (99.926%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
  35   98   99  100  119  141  144  224  278  327  386  517  684  930 1238 1696 2517 3981 8158 
And 21637 features have more than one missing value

What about if we subset to GO annotated RBPs?

human_go <- readRDS("./Input/h_sapiens_go_full.rds")
GO_RBPs <- human_go %>% filter(GO.ID=="GO:0003723") %>% pull(UNIPROTKB)
peptides_quant[fData(peptides_quant)$master_protein %in% GO_RBPs,] %>% plotMissing()
Out of 11023 total features, 11012 (99.9%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
  26   79   76   78   87  100  106  162  200  228  247  327  397  531  671  894 1253 1844 3706 
And 10986 features have more than one missing value

Ok, so this doesn’t make any difference, 99.9% have a missing value!

Let’s aggregate to the unique peptide sequence level (ignorning variable modifications) and then see whether that reduces our problem

peptides_no_mod_quant <- agg_to_peptides(peptides_quant)
Your data contains missing values. Please read the relevant section in the combineFeatures manual page for
details the effects of missing values on data aggregation.

Nope, not really! We still have 19304 features (previously 21688) and 99.9% have missing values!

plotMissing(peptides_no_mod_quant)
Out of 19304 total features, 19288 (99.917%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
  37  109  102   94  133  146  149  221  298  344  396  526  691  907 1204 1649 2389 3527 6366 
And 19251 features have more than one missing value

OK, so we’re going to have to impute. Note that the missing valuesa are particularly present in the first 2 fractions and across fractions 3-7. After that we see fewer missing values. Also, remember that we have yet to identify any definite set of RNAs from the initial fractions (1-7ish). For this reason, we’ll remove these first 5 fractions for now and leave fraction 6 & 7 as these are useful to separate light membranes

plot(colSums(is.na(exprs(peptides_no_mod_quant))))

#peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,8:20]
peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,6:20]
plotMissing(peptides_no_mod_quant_no_lm)
Out of 19304 total features, 19139 (99.145%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
 148  196  227  277  304  338  485  603  808 1070 1564 2371 3506 6476  766 
And 18991 features have more than one missing value

Ok, so now we’re down to just 99.1% missing :p but at least we have some more peptides now with relatively few (<=4 missing values)

If we focus on those peptides with 4-9 missing values, we can see that many are missing in a block of sequential fractions. Arguably, these are true missing values, e.g not at random.

missing_n <- rowSums(is.na(exprs(peptides_no_mod_quant_no_lm)))
peptides_no_mod_quant_no_lm[(missing_n>=4 & missing_n<=9),] %>% plotMissing()
Out of 2815 total features, 2815 (100%) have missing values

  4   5   6   7   8   9 
277 304 338 485 603 808 
And 2815 features have more than one missing value

We can identify the missing values which are in a sequential block of >=5 fractions in a row and replace these with zero

First, let’s make a function to identify rows of values where the missing values are not random, e.g 4 or more in a row

test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1)
is_not_at_random <- function(x){
  with(rle(is.na(x)), sum(lengths[values] >= 4))
}
is_not_at_random(test_values_1)
[1] 0
is_not_at_random(test_values_2)
[1] 1
is_not_at_random(test_values_3)
[1] 1

Now, let’s check this identifies the peptides which are not missing at random. First, we’ll remove those without at least 4 quantification values.

peptides_no_mod_quant_4 <- peptides_no_mod_quant_no_lm[missing_n<=(ncol(peptides_no_mod_quant_no_lm)-4),]
missing_not_at_random <- apply(exprs(peptides_no_mod_quant_4), 1, is_not_at_random)
peptides_no_mod_quant_4[missing_not_at_random==1,] %>% plotMissing()
Out of 3846 total features, 3846 (100%) have missing values

   4    5    6    7    8    9   10   11 
  16   66  127  290  421  682  956 1288 
And 3846 features have more than one missing value

peptides_no_mod_quant_4[missing_not_at_random==2,] %>% plotMissing()
Out of 376 total features, 376 (100%) have missing values

  8   9  10  11 
  4  36  74 262 
And 376 features have more than one missing value

Now, let’s extend the function to replace the blocks of missing values with zero

test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1, NA)
rle(is.na(test_values_1))$values
[1] FALSE  TRUE FALSE  TRUE FALSE
rle(is.na(test_values_1))$lengths
[1] 2 1 1 2 4
replace_missing_not_at_random <- function(x){
  missing_rle <- rle(is.na(x))
  
  sequential_blocks <- missing_rle$values
  sequential_blocks[missing_rle$lengths<4] <- FALSE
  replace_with_zero <- rep(sequential_blocks, missing_rle$lengths)
  
  out <- x
  out[replace_with_zero]<-0
  
  return(out)
}
replace_missing_not_at_random(test_values_1)
 [1]  1  1 NA  1 NA NA  1  1  1  1
replace_missing_not_at_random(test_values_2)
 [1] 1 1 0 0 0 0 0 1 1 1
replace_missing_not_at_random(test_values_3)
 [1]  0  0  0  0  1  1  1  1  1 NA

Below we impute a maximum of 10 missing values in sequential blocks with zeros

missing_n2 <- rowSums(is.na(exprs(peptides_no_mod_quant_4)))
peptides_no_mod_quant_4_mnar_zero <- peptides_no_mod_quant_4[missing_n2<=10,]
exprs(peptides_no_mod_quant_4_mnar_zero) <- exprs(peptides_no_mod_quant_4_mnar_zero) %>%
  apply(1, replace_missing_not_at_random) %>% t()

Re-check the missing values

plotMissing(peptides_no_mod_quant_4)
Out of 6185 total features, 6020 (97.332%) have missing values

   1    2    3    4    5    6    7    8    9   10   11 
 148  196  227  277  304  338  485  603  808 1070 1564 
And 5872 features have more than one missing value

plotMissing(peptides_no_mod_quant_4_mnar_zero)
Out of 4621 total features, 4009 (86.756%) have missing values

  1   2   3   4   5   6   7   8   9  10 
668 789 734 593 429 293 195 178  90  40 
And 3341 features have more than one missing value

peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero
imputed_zeros <- rowSums(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)==0, na.rm=TRUE)
missing_n3 <- rowSums(is.na(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)))
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation <- imputed_zeros>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation_n <- imputed_zeros
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing <- missing_n3>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing_n <- missing_n3
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero_mar_knn[missing_n3<=3,]
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))

FALSE  TRUE 
  736  2067 
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing))

FALSE  TRUE 
  612  2191 
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,
            fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
       
        FALSE TRUE
  FALSE   165  447
  TRUE    571 1620
dim(peptides_no_mod_quant_4_mnar_zero)
[1] 4621   15
dim(peptides_no_mod_quant_4_mnar_zero_mar_knn)
[1] 2803   15
peptides_no_mod_quant_4_mnar_zero_mar_knn <- suppressMessages(impute(peptides_no_mod_quant_4_mnar_zero_mar_knn, "knn", k = 10))
Cluster size 2803 broken into 2633 170 
Cluster size 2633 broken into 2177 456 
Cluster size 2177 broken into 252 1925 
Done cluster 252 
Cluster size 1925 broken into 1390 535 
Done cluster 1390 
Done cluster 535 
Done cluster 1925 
Done cluster 2177 
Done cluster 456 
Done cluster 2633 
Done cluster 170 
p <- plotLabelQuant(peptides_no_mod_quant_no_lm, log=TRUE)

p <- plotLabelQuant(peptides_no_mod_quant_4_mnar_zero_mar_knn, log=TRUE)

p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)

p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  !fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)

protein_quant <- agg_to_protein(peptides_no_mod_quant_4_mnar_zero_mar_knn)
print(dim(protein_quant))
[1] 733  15
source("~/git_repos/CamProt_R/LOPIT.R")
markers_df <- read.delim("./Input//markers_9B_hyperLOPIT_vs_DC.csv", sep=",", header=FALSE, stringsAsFactors=FALSE)[,1:2]
markers_df$V2 <- recode(markers_df$V2, "RIBOSOME 40S"="RIBOSOME", "RIBOSOME 60S"="RIBOSOME")
markers_proteins <- markers_df$V2
names(markers_proteins) <- markers_df$V1
protein_quant_am <- addMarkers(normalise(protein_quant, "sum"), markers_proteins)
Markers in data: 123 out of 733
organelleMarkers
          CYTOSOL                ER             GOLGI          LYSOSOME      MITOCHONDRIA           NUCLEUS 
                8                21                 4                 2                 5                16 
NUCLEUS-CHROMATIN        PEROXISOME                PM        PROTEASOME          RIBOSOME           unknown 
                6                 1                18                 3                39               610 
p <- plotHexbin(protein_quant_am, "markers")
print(p)

marker_classes <- getMarkerClasses(protein_quant_am, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_inc_glycoproteins.png")
Saving 7.29 x 4.5 in image

p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p

glycoproteins <- read.delim("./Input/glycoproteins.tsv")$protein

Copied from Manasa’s oopsFinal.Rmd notebook

# We will add a bit more information to the fData file 
# 1. List of known RBPs across cell lines in the XRNAX paper (Table S2)
xrnax = read.delim("./Input/xrnax-genelist.txt",sep="\t",header=T)
xrnax.rbps = xrnax %>% 
              dplyr::filter(!is.na(MCF7.RBP) | !is.na(HEK293.RBP) | !is.na(HeLa.RBP)) %>% 
              dplyr::select(Uniprot.ID:Protein.name)
rownames(xrnax.rbps) = xrnax.rbps$Uniprot.ID
print(length(rownames(xrnax.rbps)))
[1] 1753
# Check how many are common to the cell lines in the XRNAX paper
xrnax %>% 
  dplyr::select(MCF7.RBP:ihRBP) %>%
  apply(2, table,useNA="always")
            MCF7.RBP HEK293.RBP HeLa.RBP ihRBP
non-poly(A)      617        698      565   775
poly(A)          590        659      674   978
<NA>            1276       1126     1244   730
# 2. List of RBPs from SILAC experiments using OOPS after wash step2 (Table S1)
oops = read.delim("./Input/oops-genelist.txt",sep="\t",header=T)
oops.rbps = oops %>% 
              dplyr::filter(CL_NC_Ratio >= 1.0) %>% 
              dplyr::select(master_protein, RBP_glyco)
oops_rbps <- unique(oops.rbps$master_protein)
print(length(oops_rbps))
[1] 405
protein_quant_am_no_glyco <- protein_quant_am[!rownames(protein_quant_am) %in% glycoproteins,]
fData(protein_quant_am_no_glyco)$oops <- rownames(protein_quant_am_no_glyco) %in% oops_rbps
fData(protein_quant_am_no_glyco)$xrnax <- rownames(protein_quant_am_no_glyco) %in% rownames(xrnax.rbps)
fData(protein_quant_am_no_glyco)$go_rbp <- rownames(protein_quant_am_no_glyco) %in% GO_RBPs
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax))
       
        FALSE TRUE
  FALSE   176  198
  TRUE     21  239
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax,
            fData(protein_quant_am_no_glyco)$go_rbp))
, ,  = FALSE

       
        FALSE TRUE
  FALSE   149  102
  TRUE     16   31

, ,  = TRUE

       
        FALSE TRUE
  FALSE    27   96
  TRUE      5  208
print(dim(protein_quant_am))
[1] 733  15
print(dim(protein_quant_am_no_glyco))
[1] 634  15
marker_classes <- getMarkerClasses(protein_quant_am_no_glyco, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2.png")
Saving 7.29 x 4.5 in image

p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4.png")
Saving 7.29 x 4.5 in image

protein_quant_am_no_glyco_yes_rbp <- protein_quant_am_no_glyco[
  (fData(protein_quant_am_no_glyco)$oops |
     fData(protein_quant_am_no_glyco)$xrnax |
    fData(protein_quant_am_no_glyco)$go_rbp),]
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image

translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
 
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=translocon)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_translocon.png")
Saving 7.29 x 4.5 in image

print(dim(protein_quant_am_no_glyco_yes_rbp))
[1] 485  15
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=para)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_paraspeckles.png")
Saving 7.29 x 4.5 in image

p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes, organelle_order=organelle_order) +
  facet_wrap(~markers) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)

ggsave("./Output/plots/marker_profiles.png")
Saving 10 x 10 in image
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes,
                        organelle_order=organelle_order, unknown=TRUE) +
  facet_grid(zero_imputation_n~missing_n) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)

ggsave("./Output/plots/marker_profiles_imputation.png")
Saving 10 x 10 in image

Now we make a new set of markers designed to highlight functional groups of RBPs more usefully. First we need to define new sets of GO annotation proteins, where each marker belongs to only one group

translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
mrna_splicing <- human_go %>% filter(GO.ID=='GO:0000398') %>% pull(UNIPROTKB)
translation_init <- human_go %>% filter(GO.ID=='GO:0006413') %>% pull(UNIPROTKB)
translation_init <- setdiff(translation_init, names(markers_proteins)[markers_proteins=="RIBOSOME"])
cell_cell_adhesion <- human_go %>% filter(GO.ID=='GO:0098609') %>% pull(UNIPROTKB)
cytoskeleton <- human_go %>% filter(GO.ID=='GO:0005856') %>% pull(UNIPROTKB)
motor_activity <- human_go %>% filter(GO.ID=='GO:0003774') %>% pull(UNIPROTKB)
er_stress_response <- human_go %>% filter(GO.ID=='GO:0030968') %>% pull(UNIPROTKB)
nuclear_pore_channel <- human_go %>% filter(GO.ID=='GO:0044613') %>% pull(UNIPROTKB)
nuclear_pore_basket <- human_go %>% filter(GO.ID=='GO:0044615') %>% pull(UNIPROTKB)
tRNA_AA <- human_go %>% filter(GO.ID=='GO:0004812') %>% pull(UNIPROTKB)
#mrna_splicing <- setdiff(mrna_splicing, c(para, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity))
#translation_init <- setdiff(translation_init, c(para, cell_cell_adhesion, cytoskeleton, motor_activity))
#cytoskeleton <- setdiff(cytoskeleton, c(para, cell_cell_adhesion, motor_activity))
#cell_cell_adhesion <- setdiff(cell_cell_adhesion, c(para, motor_activity))
#all_markers <- c(mrna_splicing, para, translocon, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity)
#print(length(all_markers)==length(unique(all_markers)))
mrna_splicing <- setdiff(mrna_splicing, tRNA_AA)
all_markers <- c(mrna_splicing, tRNA_AA)
print(length(all_markers)==length(unique(all_markers)))
[1] TRUE
rbps_markers <- markers_proteins
localisations_to_remove <- c("PEROXISOME", "PROTEASOME", "GOLGI", "LYSOSOME")
rbps_markers <- rbps_markers[!rbps_markers %in% localisations_to_remove]
rbps_markers[rbps_markers =="NUCLEUS-CHROMATIN"] <- "NUCLEUS"
print(table(rbps_markers))
rbps_markers
     CYTOSOL           ER MITOCHONDRIA      NUCLEUS           PM     RIBOSOME 
          96          118          197          171          295           72 
new_markers <- c(#rep("PARASPECKLES", length(para)),
                 rep("mRNA splicing", length(mrna_splicing)),
                 rep("Aminoacyl-tRNA ligase", length(tRNA_AA)),
                 "PARP1"#,
                 #rep("TRANSLATION INITITAION", length(translation_init)),
                 #rep("CELL-CELL ADHESION", length(cell_cell_adhesion)),
                 #rep("MOTOR", length(motor_activity))#,
                 #rep("CYTOSKELETON", length(cytoskeleton))
                 )
names(new_markers) <- c(#para,
                        mrna_splicing,
                        tRNA_AA,
                        "P09874"#,
                        #translation_init,
                        #cell_cell_adhesion,
                        #motor_activity#,
                        #cytoskeleton
                        )
print(table(names(new_markers))[table(names(new_markers))>1])
named integer(0)
rbps_markers <- rbps_markers[!names(rbps_markers) %in% names(new_markers)]
combined_markers <- c(rbps_markers, new_markers)
print(table(combined_markers))
combined_markers
Aminoacyl-tRNA ligase               CYTOSOL                    ER          MITOCHONDRIA         mRNA splicing 
                   42                    93                   118                   193                   321 
              NUCLEUS                 PARP1                    PM              RIBOSOME 
                  149                     1                   295                    72 
print(table(names(combined_markers))[table(names(combined_markers))>1])
named integer(0)
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- NULL
protein_quant_am_no_glyco_yes_rbp <- addMarkers(protein_quant_am_no_glyco_yes_rbp, combined_markers, "new_markers")
Markers in data: 168 out of 485
organelleMarkers
Aminoacyl-tRNA ligase               CYTOSOL                    ER          MITOCHONDRIA         mRNA splicing 
                   13                     5                     7                     2                    86 
              NUCLEUS                 PARP1                    PM              RIBOSOME               unknown 
                   12                     1                     3                    39                   317 
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- recode(
  fData(protein_quant_am_no_glyco_yes_rbp)$new_markers, "NUCLEUS"="Nucleus", "RIBOSOME"="Ribosome", "CYTOSOL"="Cytosol",
  "MITOCHONDRIA"="Mitochondria") 
print(table(fData(protein_quant_am_no_glyco_yes_rbp)$new_markers))

Aminoacyl-tRNA ligase               Cytosol                    ER          Mitochondria         mRNA splicing 
                   13                     5                     7                     2                    86 
              Nucleus                 PARP1                    PM              Ribosome               unknown 
                   12                     1                     3                    39                   317 

After adding these new RBP markers, we only have 1 PM and 2 Mt proteins remaining. Let’s remove the PM protein

fData(protein_quant_am_no_glyco_yes_rbp)$new_markers[fData(protein_quant_am_no_glyco_yes_rbp)$new_markers=="PM"] <- "unknown"
new_markers_levels <- getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_channel)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_basket)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
[1] "Aminoacyl-tRNA ligase" "Cytosol"               "ER"                    "Mitochondria"         
[5] "mRNA splicing"         "Nucleus"               "PARP1"                 "Ribosome"             
set.seed(1)
proj <- make_proj("t-SNE", protein_quant_am_no_glyco_yes_rbp, "new_markers")
library(Hmisc)
Loading required package: lattice
Loading required package: survival

Attaching package: ‘survival’

The following object is masked from ‘package:robustbase’:

    heart

Loading required package: Formula

Attaching package: ‘Hmisc’

The following object is masked from ‘package:AnnotationDbi’:

    contents

The following objects are masked from ‘package:MSnbase’:

    impute, naplot

The following object is masked from ‘package:Biobase’:

    contents

The following objects are masked from ‘package:dplyr’:

    src, summarize

The following objects are masked from ‘package:base’:

    format.pval, units
proj_df <- proj$PCA_df
marker_levels <- setdiff(new_markers_levels, "unknown")
marker_levels <- marker_levels[c(2,3,4,6,8,1,5,7)]
#m_colours <- getStockcol()[c(1,7,4,3,2,5)]
m_colours <- c(cbPalette[c(6,3,2,4,8,7,5)], "grey20")
proj_df$markers <- factor(proj_df$new_markers, levels=marker_levels)
print(table(is.na(proj_df$markers)))

FALSE  TRUE 
  165   320 
proj_df$unknown <- proj_df$new_markers=="unknown"
p <- ggplot(proj_df, aes(X, Y, colour=markers)) +
    geom_point(aes(X, Y, colour=markers, alpha=unknown), size=2) +
    scale_alpha_manual(values=c(1,0.2), guide=F) +
    my_theme +
    scale_colour_manual(values=m_colours, na.value="grey60", name="", breaks=c(marker_levels)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1, size=2))) +
    xlab("Dimension 1") + ylab("Dimension 2")
print(p)
ggsave("./Output/tSNE_RBP_markers.png")
Saving 7.29 x 4.5 in image

write.table(proj_df, "./Output/tSNE_projections.tsv", sep="\t", quote=FALSE, row.name=FALSE)
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "new_markers",
                        keep_markers=getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers"), organelle_order=new_markers_levels) +
  facet_wrap(~markers) +
  scale_colour_manual(values=getClassColours()[c(1:5, 7, 10:20)], guide=FALSE) +
  theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)

protein_info <- read.delim("Input/Aggregated-proteins-2187-with-uniprot.tab") %>%
  dplyr::select(Entry, gene_name=Gene.names...primary..)
total.prot = readRDS("./Input/prot_res_20_fractions_imputed_markers.rds")
colnames(total.prot)[12] <- "0.948"
colnames(total.prot) <- c(seq(1,20,2), seq(2,20,2))
total.prot <- total.prot[,order(as.numeric(as.character(colnames(total.prot))))]
total.prot <- total.prot[,5:19]
total.prot = normalise(total.prot,"sum")
rbp.prot <- protein_quant_am_no_glyco_yes_rbp
colnames(rbp.prot) <- 5:19
print(dim(total.prot))
[1] 5610   15
print(dim(rbp.prot))
[1] 485  15
print(colnames(total.prot))
 [1] "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
print(colnames(rbp.prot))
 [1] "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
plotCombinedProfiles <- function(foi){
  
  foi_in_total <- intersect(foi, rownames(total.prot))
  foi_in_rbp <- intersect(foi, rownames(rbp.prot))
  
  foi_in_both <- intersect(foi_in_rbp, foi_in_total)
  
  print(foi_in_both)
  if(length(foi_in_both)==0){
    return(NA)
  }
  
  total_exprs_df <- exprs(total.prot[foi_in_both,])
  total_exprs_df <- melt(total_exprs_df)
  total_exprs_df$type = "All protein"
  
  rbp_exprs_df <- exprs(rbp.prot[foi_in_both,])
  rbp_exprs_df <- melt(rbp_exprs_df)
  rbp_exprs_df$type = "RNA-bound"
  
  #print(head(total_exprs_df))
  #print(head(rbp_exprs_df))
  #print(head(rbind(rbp_exprs_df, total_exprs_df)))
  
  p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=type)) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  if(length(foi_in_both)>1){
    p <- p + facet_wrap(~gene_name)
  }
  
  print(p)
  
  p2 <- p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=interaction(type, gene_name))) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  print(p2)
  
  invisible(list("p"=p, "p2"=p2))
}
plotCombinedProfiles(tRNA_AA)
 [1] "Q9NSE4" "P41252" "P54577" "P49589" "P49591" "P14868" "O43776" "P07814" "P47897" "P26639" "P26640" "P49588"
[13] "P41250"

#plotCombinedProfiles(para[[2]])
plotCombinedProfiles(motor_activity)
[1] "Q14203" "P33176" "O00159" "Q14204" "P35579" "P52732" "Q13409" "P35580" "P60660"

fvarLabels(rbp.prot)
 [1] "Checked"                       "Confidence"                    "Sequence"                     
 [4] "Modifications"                 "Qvality.PEP"                   "Qvality.q.value"              
 [7] "Number.of.Protein.Groups"      "Number.of.Proteins"            "Number.of.PSMs"               
[10] "Master.Protein.Accessions"     "Number.of.Missed.Cleavages"    "Theo.MHplus.in.Da"            
[13] "XCorr.Sequest.HT"              "Confidence.Sequest.HT"         "Percolator.q.Value.Sequest.HT"
[16] "Percolator.PEP.Sequest.HT"     "master_protein"                "protein_length"               
[19] "protein_description"           "peptide_start"                 "peptide_end"                  
[22] "crap_protein"                  "associated_crap_protein"       "unique"                       
[25] "filename"                      "zero_imputation"               "zero_imputation_n"            
[28] "missing"                       "missing_n"                     "CV.F6"                        
[31] "CV.F7"                         "CV.F8"                         "CV.F9"                        
[34] "CV.F10"                        "CV.F11"                        "CV.F12"                       
[37] "CV.F13"                        "CV.F14"                        "CV.F15"                       
[40] "CV.F16"                        "CV.F17"                        "CV.F18"                       
[43] "CV.F19"                        "CV.F20"                        "markers"                      
[46] "oops"                          "xrnax"                         "go_rbp"                       
[49] "new_markers"                  
well_quantified_rbps <- fData(rbp.prot) %>%
  filter(zero_imputation_n<=4, missing_n<=2) %>%
  pull(master_protein)
plotCombinedProfiles(intersect(mrna_splicing, well_quantified_rbps))
[1] "Q01081" "Q00839" "P67809" "P11142" "O43290" "Q13247" "Q15366" "Q8IYB3"

ribosome_proteins <- names(markers_proteins)[markers_proteins=="RIBOSOME"]
plotCombinedProfiles(intersect(ribosome_proteins, well_quantified_rbps))
[1] "P15880" "P46781" "P35268" "P62750" "Q02543" "P36578" "P47914"

plotCombinedProfiles("Q15942")
[1] "Q15942"

p <- plotCombinedProfiles("P09874")
[1] "P09874"

ggsave("./Output/PARP1_profiles.png", p$p2)
Saving 7.29 x 4.5 in image

---
title: "R Notebook"
output: html_notebook
---

This notebook is Tom's analysis from raw data

```{r}
source("../CamProt_R/Utility.R")
library(tidyverse)
infile <- "Input/OOPS_qLOPIT_LabelFree_PeptideGroups_parsed.txt"
samples_inf <- "Input/samples.tsv"
```

```{r}
peptides_df <- parse_features(infile, silac=FALSE, TMT=FALSE, level="peptide", filter_crap=TRUE)
```

```{r}
colnames(peptides_df)

peptides_quant <- makeMSNSet(obj=peptides_df, samples_inf=samples_inf, ab_col_ix=2, level="peptide", quant_name="Area")

```
So lots and lots of missing values! most peptides are quantified in only a very minor subset of fractions (<5/20). This is no suprise since we're dealing with LFQ and subcellular fractionation here.
```{r}
plotMissing(peptides_quant)
```
What about if we subset to GO annotated RBPs?
```{r}
human_go <- readRDS("./Input/h_sapiens_go_full.rds")
GO_RBPs <- human_go %>% filter(GO.ID=="GO:0003723") %>% pull(UNIPROTKB)
```

```{r}
peptides_quant[fData(peptides_quant)$master_protein %in% GO_RBPs,] %>% plotMissing()
```
Ok, so this doesn't make any difference, 99.9% have a missing value!

Let's aggregate to the unique peptide sequence level (ignorning variable modifications) and then see whether that reduces our problem

```{r}
peptides_no_mod_quant <- agg_to_peptides(peptides_quant)
```

Nope, not really! We still have 19304 features (previously 21688) and 99.9% have missing values!
```{r}
plotMissing(peptides_no_mod_quant)
```

OK, so we're going to have to impute. Note that the missing valuesa are particularly present in the first 2 fractions and across fractions 3-7. After that we see fewer missing values. Also, remember that we have yet to identify any definite set of RNAs from the initial fractions (1-7ish). For this reason, we'll remove these first 5 fractions for now and leave fraction 6 & 7 as these are useful to separate light membranes
```{r}
plot(colSums(is.na(exprs(peptides_no_mod_quant))))
```


```{r}
#peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,8:20]
peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,6:20]
plotMissing(peptides_no_mod_quant_no_lm)
```
Ok, so now we're down to _just_ 99.1% missing :p but at least we have some more peptides now with relatively few (<=4 missing values)

If we focus on those peptides with 4-9 missing values, we can see that many are missing in a block of sequential fractions. Arguably, these are true missing values, e.g not at random. 
```{r}
missing_n <- rowSums(is.na(exprs(peptides_no_mod_quant_no_lm)))
peptides_no_mod_quant_no_lm[(missing_n>=4 & missing_n<=9),] %>% plotMissing()
```
We can identify the missing values which are in a sequential block of >=5 fractions in a row and replace these with zero

First, let's make a function to identify rows of values where the missing values are not random, e.g 4 or more in a row
```{r}
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1)

is_not_at_random <- function(x){
  with(rle(is.na(x)), sum(lengths[values] >= 4))
}

is_not_at_random(test_values_1)
is_not_at_random(test_values_2)
is_not_at_random(test_values_3)
```

Now, let's check this identifies the peptides which are not missing at random. First, we'll remove those without at least 4 quantification values.
```{r}
peptides_no_mod_quant_4 <- peptides_no_mod_quant_no_lm[missing_n<=(ncol(peptides_no_mod_quant_no_lm)-4),]
missing_not_at_random <- apply(exprs(peptides_no_mod_quant_4), 1, is_not_at_random)
```

```{r}
peptides_no_mod_quant_4[missing_not_at_random==1,] %>% plotMissing()
peptides_no_mod_quant_4[missing_not_at_random==2,] %>% plotMissing()
```
Now, let's extend the function to replace the blocks of missing values with zero
```{r}
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1, NA)


rle(is.na(test_values_1))$values
rle(is.na(test_values_1))$lengths

replace_missing_not_at_random <- function(x){
  missing_rle <- rle(is.na(x))
  
  sequential_blocks <- missing_rle$values
  sequential_blocks[missing_rle$lengths<4] <- FALSE

  replace_with_zero <- rep(sequential_blocks, missing_rle$lengths)
  
  out <- x
  out[replace_with_zero]<-0
  
  return(out)

}

replace_missing_not_at_random(test_values_1)
replace_missing_not_at_random(test_values_2)
replace_missing_not_at_random(test_values_3)
```


Below we impute a maximum of 10 missing values in sequential blocks with zeros
```{r}
missing_n2 <- rowSums(is.na(exprs(peptides_no_mod_quant_4)))
peptides_no_mod_quant_4_mnar_zero <- peptides_no_mod_quant_4[missing_n2<=10,]

exprs(peptides_no_mod_quant_4_mnar_zero) <- exprs(peptides_no_mod_quant_4_mnar_zero) %>%
  apply(1, replace_missing_not_at_random) %>% t()
```

Re-check the missing values
```{r}
plotMissing(peptides_no_mod_quant_4)
plotMissing(peptides_no_mod_quant_4_mnar_zero)
```

```{r}
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero

imputed_zeros <- rowSums(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)==0, na.rm=TRUE)
missing_n3 <- rowSums(is.na(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)))

fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation <- imputed_zeros>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation_n <- imputed_zeros

fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing <- missing_n3>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing_n <- missing_n3


peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero_mar_knn[missing_n3<=3,]

print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing))

print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,
            fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))

dim(peptides_no_mod_quant_4_mnar_zero)
dim(peptides_no_mod_quant_4_mnar_zero_mar_knn)
peptides_no_mod_quant_4_mnar_zero_mar_knn <- suppressMessages(impute(peptides_no_mod_quant_4_mnar_zero_mar_knn, "knn", k = 10))
```

```{r}
p <- plotLabelQuant(peptides_no_mod_quant_no_lm, log=TRUE)
p <- plotLabelQuant(peptides_no_mod_quant_4_mnar_zero_mar_knn, log=TRUE)
```

```{r}
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  !fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
```

```{r}
protein_quant <- agg_to_protein(peptides_no_mod_quant_4_mnar_zero_mar_knn)
print(dim(protein_quant))
```

```{r}
source("~/git_repos/CamProt_R/LOPIT.R")
```

```{r}
markers_df <- read.delim("./Input//markers_9B_hyperLOPIT_vs_DC.csv", sep=",", header=FALSE, stringsAsFactors=FALSE)[,1:2]
markers_df$V2 <- recode(markers_df$V2, "RIBOSOME 40S"="RIBOSOME", "RIBOSOME 60S"="RIBOSOME")
markers_proteins <- markers_df$V2
names(markers_proteins) <- markers_df$V1

protein_quant_am <- addMarkers(normalise(protein_quant, "sum"), markers_proteins)

p <- plotHexbin(protein_quant_am, "markers")
print(p)
```


```{r}

marker_classes <- getMarkerClasses(protein_quant_am, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
print(p)
ggsave("./Output/plots/pca_1_2_inc_glycoproteins.png")

p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
print(p)
```

```{r}
glycoproteins <- read.delim("./Input/glycoproteins.tsv")$protein
```

Copied from Manasa's oopsFinal.Rmd notebook
```{r}
# We will add a bit more information to the fData file 
# 1. List of known RBPs across cell lines in the XRNAX paper (Table S2)
xrnax = read.delim("./Input/xrnax-genelist.txt",sep="\t",header=T)
xrnax.rbps = xrnax %>% 
              dplyr::filter(!is.na(MCF7.RBP) | !is.na(HEK293.RBP) | !is.na(HeLa.RBP)) %>% 
              dplyr::select(Uniprot.ID:Protein.name)
rownames(xrnax.rbps) = xrnax.rbps$Uniprot.ID
print(length(rownames(xrnax.rbps)))

# Check how many are common to the cell lines in the XRNAX paper
xrnax %>% 
  dplyr::select(MCF7.RBP:ihRBP) %>%
  apply(2, table,useNA="always")

# 2. List of RBPs from SILAC experiments using OOPS after wash step2 (Table S1)
oops = read.delim("./Input/oops-genelist.txt",sep="\t",header=T)
oops.rbps = oops %>% 
              dplyr::filter(CL_NC_Ratio >= 1.0) %>% 
              dplyr::select(master_protein, RBP_glyco)

oops_rbps <- unique(oops.rbps$master_protein)
print(length(oops_rbps))
```

```{r}
protein_quant_am_no_glyco <- protein_quant_am[!rownames(protein_quant_am) %in% glycoproteins,]
fData(protein_quant_am_no_glyco)$oops <- rownames(protein_quant_am_no_glyco) %in% oops_rbps
fData(protein_quant_am_no_glyco)$xrnax <- rownames(protein_quant_am_no_glyco) %in% rownames(xrnax.rbps)
fData(protein_quant_am_no_glyco)$go_rbp <- rownames(protein_quant_am_no_glyco) %in% GO_RBPs

print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax))
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax,
            fData(protein_quant_am_no_glyco)$go_rbp))

print(dim(protein_quant_am))
print(dim(protein_quant_am_no_glyco))
```
```{r}
marker_classes <- getMarkerClasses(protein_quant_am_no_glyco, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
print(p)
ggsave("./Output/plots/pca_1_2.png")

p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
print(p)
ggsave("./Output/plots/pca_3_4.png")
```
```{r}
protein_quant_am_no_glyco_yes_rbp <- protein_quant_am_no_glyco[
  (fData(protein_quant_am_no_glyco)$oops |
     fData(protein_quant_am_no_glyco)$xrnax |
    fData(protein_quant_am_no_glyco)$go_rbp),]

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
print(p)
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps.png")


p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
print(p)
ggsave("./Output/plots/pca_3_4_no_glyco_only_rbps.png")
```

```{r}
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)

 
```

```{r}
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=translocon)
print(p)
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_translocon.png")
```
```{r}
print(dim(protein_quant_am_no_glyco_yes_rbp))
```

```{r}
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=para)
print(p)
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_paraspeckles.png")
```



```{r, fig.width=10, fig.height=10}
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes, organelle_order=organelle_order) +
  facet_wrap(~markers) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
print(p)
ggsave("./Output/plots/marker_profiles.png")

p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes,
                        organelle_order=organelle_order, unknown=TRUE) +
  facet_grid(zero_imputation_n~missing_n) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
print(p)
ggsave("./Output/plots/marker_profiles_imputation.png")
```

Now we make a new set of markers designed to highlight functional groups of RBPs more usefully. First we need to define new sets of GO annotation proteins, where each marker belongs to only one group
```{r}
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
mrna_splicing <- human_go %>% filter(GO.ID=='GO:0000398') %>% pull(UNIPROTKB)
translation_init <- human_go %>% filter(GO.ID=='GO:0006413') %>% pull(UNIPROTKB)
translation_init <- setdiff(translation_init, names(markers_proteins)[markers_proteins=="RIBOSOME"])

cell_cell_adhesion <- human_go %>% filter(GO.ID=='GO:0098609') %>% pull(UNIPROTKB)
cytoskeleton <- human_go %>% filter(GO.ID=='GO:0005856') %>% pull(UNIPROTKB)
motor_activity <- human_go %>% filter(GO.ID=='GO:0003774') %>% pull(UNIPROTKB)
er_stress_response <- human_go %>% filter(GO.ID=='GO:0030968') %>% pull(UNIPROTKB)
nuclear_pore_channel <- human_go %>% filter(GO.ID=='GO:0044613') %>% pull(UNIPROTKB)
nuclear_pore_basket <- human_go %>% filter(GO.ID=='GO:0044615') %>% pull(UNIPROTKB)
tRNA_AA <- human_go %>% filter(GO.ID=='GO:0004812') %>% pull(UNIPROTKB)

#mrna_splicing <- setdiff(mrna_splicing, c(para, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity))
#translation_init <- setdiff(translation_init, c(para, cell_cell_adhesion, cytoskeleton, motor_activity))
#cytoskeleton <- setdiff(cytoskeleton, c(para, cell_cell_adhesion, motor_activity))
#cell_cell_adhesion <- setdiff(cell_cell_adhesion, c(para, motor_activity))

#all_markers <- c(mrna_splicing, para, translocon, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity)
#print(length(all_markers)==length(unique(all_markers)))


mrna_splicing <- setdiff(mrna_splicing, tRNA_AA)

all_markers <- c(mrna_splicing, tRNA_AA)
print(length(all_markers)==length(unique(all_markers)))

```

```{r}

rbps_markers <- markers_proteins
localisations_to_remove <- c("PEROXISOME", "PROTEASOME", "GOLGI", "LYSOSOME")

rbps_markers <- rbps_markers[!rbps_markers %in% localisations_to_remove]
rbps_markers[rbps_markers =="NUCLEUS-CHROMATIN"] <- "NUCLEUS"

print(table(rbps_markers))


new_markers <- c(#rep("PARASPECKLES", length(para)),
                 rep("mRNA splicing", length(mrna_splicing)),
                 rep("Aminoacyl-tRNA ligase", length(tRNA_AA)),
                 "PARP1"#,
                 #rep("TRANSLATION INITITAION", length(translation_init)),
                 #rep("CELL-CELL ADHESION", length(cell_cell_adhesion)),
                 #rep("MOTOR", length(motor_activity))#,
                 #rep("CYTOSKELETON", length(cytoskeleton))
                 )

names(new_markers) <- c(#para,
                        mrna_splicing,
                        tRNA_AA,
                        "P09874"#,
                        #translation_init,
                        #cell_cell_adhesion,
                        #motor_activity#,
                        #cytoskeleton
                        )
print(table(names(new_markers))[table(names(new_markers))>1])

rbps_markers <- rbps_markers[!names(rbps_markers) %in% names(new_markers)]

combined_markers <- c(rbps_markers, new_markers)
print(table(combined_markers))
print(table(names(combined_markers))[table(names(combined_markers))>1])

fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- NULL
protein_quant_am_no_glyco_yes_rbp <- addMarkers(protein_quant_am_no_glyco_yes_rbp, combined_markers, "new_markers")

fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- recode(
  fData(protein_quant_am_no_glyco_yes_rbp)$new_markers, "NUCLEUS"="Nucleus", "RIBOSOME"="Ribosome", "CYTOSOL"="Cytosol",
  "MITOCHONDRIA"="Mitochondria") 

print(table(fData(protein_quant_am_no_glyco_yes_rbp)$new_markers))
```
After adding these new RBP markers, we only have 1 PM and 2 Mt proteins remaining. Let's remove the PM protein
```{r}
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers[fData(protein_quant_am_no_glyco_yes_rbp)$new_markers=="PM"] <- "unknown"
```

```{r}
new_markers_levels <- getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_channel)
print(p)


p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_basket)
print(p)

```


```{r}
getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")

set.seed(1)
proj <- make_proj("t-SNE", protein_quant_am_no_glyco_yes_rbp, "new_markers")
```

```{r}
library(Hmisc)
```

```{r}
proj_df <- proj$PCA_df
marker_levels <- setdiff(new_markers_levels, "unknown")
marker_levels <- marker_levels[c(2,3,4,6,8,1,5,7)]
#m_colours <- getStockcol()[c(1,7,4,3,2,5)]
m_colours <- c(cbPalette[c(6,3,2,4,8,7,5)], "grey20")

proj_df$markers <- factor(proj_df$new_markers, levels=marker_levels)
print(table(is.na(proj_df$markers)))
proj_df$unknown <- proj_df$new_markers=="unknown"

p <- ggplot(proj_df, aes(X, Y, colour=markers)) +
    geom_point(aes(X, Y, colour=markers, alpha=unknown), size=2) +
    scale_alpha_manual(values=c(1,0.2), guide=F) +
    my_theme +
    scale_colour_manual(values=m_colours, na.value="grey60", name="", breaks=c(marker_levels)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1, size=2))) +
    xlab("Dimension 1") + ylab("Dimension 2")

print(p)
ggsave("./Output/tSNE_RBP_markers.png")
write.table(proj_df, "./Output/tSNE_projections.tsv", sep="\t", quote=FALSE, row.name=FALSE)


```
```{r}
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "new_markers",
                        keep_markers=getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers"), organelle_order=new_markers_levels) +
  facet_wrap(~markers) +
  scale_colour_manual(values=getClassColours()[c(1:5, 7, 10:20)], guide=FALSE) +
  theme(legend.position="none")
print(p)
```
```{r}

protein_info <- read.delim("Input/Aggregated-proteins-2187-with-uniprot.tab") %>%
  dplyr::select(Entry, gene_name=Gene.names...primary..)

total.prot = readRDS("./Input/prot_res_20_fractions_imputed_markers.rds")
colnames(total.prot)[12] <- "0.948"
colnames(total.prot) <- c(seq(1,20,2), seq(2,20,2))
total.prot <- total.prot[,order(as.numeric(as.character(colnames(total.prot))))]
total.prot <- total.prot[,5:19]
total.prot = normalise(total.prot,"sum")

rbp.prot <- protein_quant_am_no_glyco_yes_rbp
colnames(rbp.prot) <- 5:19

print(dim(total.prot))
print(dim(rbp.prot))

print(colnames(total.prot))
print(colnames(rbp.prot))

```



```{r}
plotCombinedProfiles <- function(foi){
  
  foi_in_total <- intersect(foi, rownames(total.prot))
  foi_in_rbp <- intersect(foi, rownames(rbp.prot))
  
  foi_in_both <- intersect(foi_in_rbp, foi_in_total)
  
  print(foi_in_both)
  if(length(foi_in_both)==0){
    return(NA)
  }
  
  total_exprs_df <- exprs(total.prot[foi_in_both,])
  total_exprs_df <- melt(total_exprs_df)
  total_exprs_df$type = "All protein"
  
  rbp_exprs_df <- exprs(rbp.prot[foi_in_both,])
  rbp_exprs_df <- melt(rbp_exprs_df)
  rbp_exprs_df$type = "RNA-bound"
  
  #print(head(total_exprs_df))
  #print(head(rbp_exprs_df))
  #print(head(rbind(rbp_exprs_df, total_exprs_df)))
  
  p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=type)) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  if(length(foi_in_both)>1){
    p <- p + facet_wrap(~gene_name)
  }
  
  print(p)
  
  p2 <- p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=interaction(type, gene_name))) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  print(p2)
  
  invisible(list("p"=p, "p2"=p2))
}
```

```{r, fig.width=10, fig.height=10}
plotCombinedProfiles(tRNA_AA)
#plotCombinedProfiles(para[[2]])
```

```{r, fig.width=10, fig.height=10}
plotCombinedProfiles(motor_activity)
```
```{r}
fvarLabels(rbp.prot)
well_quantified_rbps <- fData(rbp.prot) %>%
  filter(zero_imputation_n<=4, missing_n<=2) %>%
  pull(master_protein)

plotCombinedProfiles(intersect(mrna_splicing, well_quantified_rbps))
```
```{r}
ribosome_proteins <- names(markers_proteins)[markers_proteins=="RIBOSOME"]
plotCombinedProfiles(intersect(ribosome_proteins, well_quantified_rbps))
```
```{r}
plotCombinedProfiles("Q15942")
```
```{r}
p <- plotCombinedProfiles("P09874")

ggsave("./Output/PARP1_profiles.png", p$p2)

```

